library(dplyr)
library(gtsummary)
library(fastDummies)
library(readstata13)
library(modelsummary)
library(data.table)
library(splitstackshape)
library(dplyr)
library(fastDummies)
library(rdrobust)
library(lubridate) 
library(kableExtra)

#working directory
  setwd("") 

#input stata dataset and save a copy
  raw <- read.dta13("")
  base<-as.data.table(raw)
  
#dataset for analyzing hypertension treatment
  data_treatment<-data.frame(read.csv("carrs_full.csv"))
  data_treatment<-as.data.table(data_treatment)
  
#dataset for analyzing hypertension diagnosis 
  data_diagnosis<-data.frame(read.csv("carrs_selected.csv"))
  data_diagnosis<-as.data.table(data_diagnosis)
  
###########################################################################################
# General preparations
###########################################################################################
  
#sort ids by year of interview
  base<-base[order(hhp_id,pid,entryd)]
  
#creating a variable with year of survey and months of survey
  fnames<- grep("entry", names(base), value=T)
  for (i in fnames) {
    temp <- tstrsplit(base[[i]], "-")
    set(base, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
  }
  #removing day variables
    base<-base[,c("entryd_3"  ,    "entryd_fup1_3",
                  "entryd_fup2_3", "entryd_fup3_3", "entryd_fup4_3" ,"entryd_fup5_3"):=NULL] 
    
    rm(fnames,temp,i,raw)
  
  #renaming year and month variables; saving as numeric
    base<-setnames(base,old=c("entryd_1", "entryd_2" ,   
                              "entryd_fup1_1" ,  "entryd_fup1_2",     
                              "entryd_fup2_1" ,  "entryd_fup2_2", 
                              "entryd_fup3_1" ,  "entryd_fup3_2", 
                              "entryd_fup4_1" ,  "entryd_fup4_2",      
                              "entryd_fup5_1" ,  "entryd_fup5_2" ), new=c("year_0", "month_0",
                                                                          "year_1", "month_1",
                                                                          "year_2", "month_2",
                                                                          "year_3", "month_3",
                                                                          "year_4", "month_4",
                                                                          "year_5", "month_5"))
    numer <- c("year_0", "month_0",
               "year_1", "month_1",
               "year_2", "month_2",
               "year_3", "month_3",
               "year_4", "month_4",
               "year_5", "month_5")


  base[, numer] <- base[, lapply(.SD, as.numeric), .SDcols = numer]
  base<-base[,w01:=ifelse(is.na(year_0)=="FALSE" & is.na(year_1)=="FALSE",1,0)] # baseline and follow up 1
  base<-base[,w23:=ifelse(is.na(year_2)=="FALSE" & is.na(year_3)=="FALSE",1,0)] # follow up 2 and follow up 3
  base<-base[,w45:=ifelse(is.na(year_4)=="FALSE" & is.na(year_5)=="FALSE",1,0)]  # follow up 4 and follow up 5
  rm(numer)


# female variable
  base<-base[,female:=ifelse(pd_sex=="Female",1,ifelse(pd_sex=="Male",0,NA))]

# religion variable  
  base<-base[, religion:= ifelse(pd_relig=="Hindu",0,ifelse(pd_relig=="Muslim",1,2))]
  base$religion <- factor(base$religion,labels =c("Hindu","Muslim","Others"))
  
################################################################################
# Function for summary tables
################################################################################
  tables <- function(data1, data2, a) {

    data1 <- as.data.table(data1)
     
     # prior diagnosis of cardiometabolic disease
      data1<-data1[wave==1,diabetesknow:=ifelse(pd_diabetes=="Yes",1,ifelse(pd_diabetes=="No",0,NA))] # individual reports diabetes diagnosis at baseline 
      data1<-data1[wave==1,heartknow:=ifelse(pd_heart=="Yes",1,ifelse(pd_heart=="No",0,NA))] # individual reports heart attack at baseline
      data1<-data1[wave==1,strokeknow:=ifelse(pd_stroke=="Yes",1,ifelse(pd_stroke=="No",0,NA))] # individual reports stroke at baseline
      data1<-data1[wave==1,anyknow:=ifelse((diabetesknow==1 | heartknow==1 | strokeknow==1),1,ifelse((diabetesknow==0 & heartknow==0 & strokeknow==0),0,NA))] # individual reports any of the three CMDs at baseline
      data1<-data1[wave==3,diabetesknow:=ifelse(dia_dis=="Yes",1,ifelse(dia_dis=="No",0,NA))] # individual reports diabetes diagnosis at baseline
      data1<-data1[wave==3,heartknow:=ifelse(hrt_dis=="Yes",1,ifelse(hrt_dis=="No",0,NA))] # individual reports heart attack at baseline
      data1<-data1[wave==3,strokeknow:=ifelse(stroke=="Yes",1,ifelse(stroke=="No",0,NA))] # individual reports stroke at baseline
      data1<-data1[wave==3,anyknow:=ifelse((diabetesknow==1 | heartknow==1 | strokeknow==1),1,ifelse((diabetesknow==0 & heartknow==0 & strokeknow==0),0,NA))] # individual reports any of the three CMDs at baseline
      data1<-data1[wave==2,diabetesknow:=ifelse(mh_diab=="Yes" | pd_diabetes=="Yes" ,1,ifelse(mh_diab=="No",0,NA))] # individual reports diabetes diagnosis at baseline 
      data1<-data1[wave==2,heartknow:=ifelse(mh_heart=="Yes" | pd_heart=="Yes",1,ifelse(mh_heart=="No",0,NA))] # individual reports heart attack at baseline
      data1<-data1[wave==2,strokeknow:=ifelse(mh_stroke=="Yes" | pd_heart=="Yes",1,ifelse(mh_stroke=="No",0,NA))] # individual reports stroke at baseline
      data1<-data1[wave==2,anyknow:=ifelse((diabetesknow==1 | heartknow==1 | strokeknow==1),1,ifelse((diabetesknow==0 & heartknow==0 & strokeknow==0),0,NA))] # individual reports any of the three CMDs at baseline
      
    # age categories (<40 and 40+) 
      data1 <- data1[, agecat:= ifelse(age_calc < 40, 0, ifelse(age_calc >= 40, 1, NA))]
      data1$agecatf <- factor(data1$agecat)
      
    # level of education 
      # The labels are wrong. Primary school is actually "uncompleted primary school", 
      # High school refers to "secondary school completed". Secondary School / intermediary refers to High School. 
      # This can be verified by looking at the years of schooling in each of the categories.
      data1 <- data1[, edu:= ifelse(pd_edu_stat == "Illiterate" | pd_edu_stat == "Literate" | pd_edu_stat == "Primary school", 0,
                                  ifelse(pd_edu_stat == "High school",1,
                                         ifelse(pd_edu_stat == "Secondary school/ Intermediary" | pd_edu_stat == "Graduate" | pd_edu_stat == "Professional degree/post graduate", 2, NA)))]
      data1 <- data1[is.na(edu)=="FALSE"]
      data1$eduf <- factor(data1$edu, labels = c("Primary","Secondary","Tertiary"))

    # keeping relevant variables 
      data1_selected <- select(data1, wave,diabetesknow ,eduf, heartknow, strokeknow, age_calc, female, religion, agecat, edu,agecatf,)
      data1_selected <- dummy_cols(data1_selected)
  
    # generate categorical wave identifier
      data2 <- data2 %>%
        mutate(wave = case_when(
          w01 == 1 ~ 1,
          w23 == 1 ~ 2,
          w45 == 1 ~ 3,
          TRUE ~ NA_real_
        ))
      data2 <- as.data.table(data2)
    
    # keeping relevant variables and bringing them into the right format
      data2_selected <- select(data2, diabetesknow , wave,heartknow, strokeknow,wave,  age_calc, female, religion, agecat, edu)
      data2_selected$agecatf <- factor(data2_selected$agecat)
      data2_selected$eduf <- factor(data2_selected$edu, labels = c("Primary","Secondary","Tertiary"))
      data2_selected <- dummy_cols(data2_selected)
    
    # Add a variable indicating the dataset (for comparison)
      data2_selected$dataset <- "Overall Sample"
      data1_selected$dataset <- "Attrition Sample"
    
    # Combine the datasets
      combined_data <- bind_rows(data1_selected, data2_selected)
      combined_data <-combined_data %>%   select(-religion,-agecat,-agecatf,-edu,-eduf)
      combined_data <-combined_data 
  
    # Create a summary table using gtsummary
      tout <<- combined_data %>% filter(wave==a) %>%
        select("female","age_calc","agecatf_0",  "agecatf_1", 
               "religion_Hindu", "religion_Muslim", "religion_Others", 
               "eduf_Primary", "eduf_Secondary", 
               "eduf_Primary", "eduf_Tertiary",
               "diabetesknow","heartknow","strokeknow","dataset") %>%
        tbl_summary(
          by = dataset,
          statistic = list(all_continuous() ~ "{mean} ({sd})",
                           all_categorical() ~ "{p}%"),
          digits = list(all_categorical() ~ c(1, 1),
                        all_continuous() ~ c(0, 0)),
          missing = "no",
          label = c(total = "Number",
                    age_calc = "Mean age, years (SD)",
                    religion_Muslim = "Muslim ",
                    religion_Others = "Other religions",
                    agecatf_0 = "Age <40 years",
                    agecatf_1 = "Age >40 years",
                    religion_Hindu = "Hindu",
                    eduf_Primary = "Primary school",
                    eduf_Secondary = "Secondary school",
                    eduf_Tertiary = "Tertiary school",
                    diabetesknow = "Suffers from diabetes",
                    heartknow = "Suffered a heart attack ",
                    strokeknow = "Suffered a stroke",
                    female = "Female"
          )
        ) %>%
        add_p(
          # perform t-test for all variables
          test = everything() ~ "t.test",
          # assume equal variance in the t-test
          test.args = all_tests("t.test") ~ list(var.equal = TRUE)
        )
  }
################################################################################
# Attrition in Pair 1
################################################################################

# datasets that only contain individuals from pair 1
  data_diagnosis1<-data_diagnosis[w01==1]
  data_treatment1<-data_treatment[w01==1]
  
# keep everyone who is in the pair baseline
  base1<-base[is.na(year_0)=="FALSE",]

# keep everyone aged 30 and older
  # age variable at baseline (i.e. wave0)
  base1 <- base1[, age_calc:= trunc((pd_dob %--% entryd) / years(1))]
  #! I move the code from above to be specific to the this pair
  
  base1<-base1[age_calc>=30,] 
  
# drop 1 observation with missing sex
  base1<-na.omit(base1, cols=(c("female"))) 

# generate pair indicator
  base1$wave<-1

# generate variable for respondent reports prior diagnosis at pair baseline
  base1<-base1[,hypknow:=ifelse(pd_hbp=="Yes",1,ifelse(pd_hbp=="No",0,NA))]
  
# generate variable for respondent took medication at pair baseline
  base1<-base1[is.na(hypknow)=="FALSE" & hbp_trt_allopdrug=="Yes", hyptreatknow:=1]
  base1<-base1[is.na(hypknow)=="FALSE" & is.na(hyptreatknow)=="TRUE", hyptreatknow:=0] 

# drop respondents with missing BP measurements at pair baseline
  base1<-base1[ is.na(systolic_bp_second)=="FALSE" & is.na(systolic_bp_first)=="FALSE",]
  base1<-base1[ is.na(diastolic_bp_second)=="FALSE" & is.na(diastolic_bp_first)=="FALSE",]
  
# Dataset for diagnosis outcome: Keep individuals who do not report prior diagnosis at pair baseline
    base1know<-base1[hypknow==0]

# Perform an anti-join to select observations in dataset 1 (all eligible individuals) but not in dataset 2 (those who ended up in final sample)
  data_attrition <- anti_join(base1know, data_diagnosis1, by = "pid")
  
# Dataset for treatment outcome: Keep individuals who do not report taking medication at pair baseline
  base1treat<-base1[hyptreatknow==0 ]

# Perform an anti-join to select observations in dataset 1 (all eligible individuals) but not in dataset 2 (those who ended up in final sample)
  data_attrition_treat <- anti_join(base1treat, data_treatment1, by = "pid")

# Output table, outcome: diagnosis
  tables(data_attrition,data_diagnosis1,1)
  as_kable_extra(tout, format = "latex",booktabs = T,  escape = FALSE,
                 linesep = '', caption = "Characteristics of the overall and attrition samples, outcome: hypertension diagnosis, Pair 1")  %>%
    kable_styling(position = "center",latex_options = "HOLD_position")%>%
    footnote(general=c("The overall sample consists of individuals meeting the eligibility criteria at the baseline and were also interviewed at the follow-up wave. The attrition sample consists of individuals meeting the eligibility criteria at the baseline but were lost to follow-up.", "Abbreviation: SD = standard deviation."), general_title="",threeparttable = T, fixed_small_size = T) %>%
    save_kable(file = "attrtion_diagnosis_wave01.tex",format="latex")
  
  rm(tout,data_attrition)
  
# Output table, outcome: treatment
  tables(data_attrition_treat,data_treatment1,1)   
  as_kable_extra(tout, format = "latex",booktabs = T,  escape = FALSE,
                 linesep = '', caption = "Characteristics of the overall and attrition samples, outcome: hypertension treatment, Pair 1")  %>%
    kable_styling(position = "center",latex_options = "HOLD_position")%>%
    footnote(general=c("The overall sample consists of individuals meeting the eligibility criteria at the baseline and were
                        also interviewed at the follow-up wave. The attrition sample consists of individuals meeting the eligibility criteria 
                        at the baseline but were lost to follow-up.", "Abbreviation: SD = standard deviation."), general_title="",threeparttable = T, fixed_small_size = T) %>%
    save_kable(file = "attrtion_treatment_wave01.tex",format="latex")
  rm(tout,data_attrition_treat)

################################################################################
# Attrition in Pair 2
################################################################################

# datasets that only contain individuals from pair 2
  data_diagnosis2<-data_diagnosis[w23==1]
  data_treatment2<-data_treatment[w23==1]
  
# keep everyone who is in the pair baseline
  base2<-base[is.na(year_2)=="FALSE",]
  
# keep everyone aged 30 and older
  base2 <- base2[, age_calc:= trunc((pd_dob %--% entryd_fup2) / years(1))]
  base2<-base2[age_calc>=30,]

# drop 1 observation with missing sex
  base2<-na.omit(base2, cols=(c("female")))  

  
# generate pair indicator
  base2$wave<-2

# generate variable for respondent reports prior diagnosis at pair baseline
  base2<-base2[ (mh_hbp=="Yes"| pd_hbp=="Yes" | fo_hypert=="Yes"),hypknow:=1] 
  base2<-base2[ is.na(hypknow)=="TRUE",hypknow:=0] 
  base2<-base2[ (is.na(mh_hbp)=="TRUE" & is.na(pd_hbp)=="TRUE" & is.na(fo_hypert)=="TRUE"),hypknow:=NA]

# generate variable for respondent took medication at pair baseline 
  base2<-base2[hbp_allopathic=="Yes" , hyptreatknow:=1] #respondents who were taking bp medication in baseline of this pair/follow up 2 or wave 3 
  base2<-base2[(is.na(hyptreatknow)=="TRUE" & is.na(hypknow)=="FALSE" ) , hyptreatknow:=0] #respondents who  were taking bp medication  in w01line 
  
# drop respondents with missing BP measurements at pair baseline
  base2<-base2[is.na(sbp_fu2_2)=="FALSE" & is.na(sbp_fu2_1)=="FALSE",]
  base2<-base2[is.na(dbp_fu2_2)=="FALSE" & is.na(dbp_fu2_1)=="FALSE",]       

# Dataset for diagnosis outcome: Keep individuals who do not report prior diagnosis at pair baseline
  base2know<-base2[hypknow==0]
  
# Perform an anti-join to select observations in dataset 1 (all eligible individuals) but not in dataset 2 (those who ended up in final sample)
  data_attrition <- anti_join(base2know, data_diagnosis2, by = "pid")

# Dataset for treatment outcome: Keep individuals who do not report taking medication at pair baseline
  base2treat<-base2[hyptreatknow==0]
  
# Perform an anti-join to select observations in dataset 1 (all eligible individuals) but not in dataset 2 (those who ended up in final sample)
  data_attrition_treat <- anti_join(base2treat, data_treatment2, by = "pid")
  

# Output table, outcome: diagnosis
  tables(data_attrition,data_diagnosis2,2)
  as_kable_extra(tout, format = "latex",booktabs = T,  escape = FALSE,
                 linesep = '', caption = "Characteristics of the overall and attrition samples, outcome: hypertension diagnosis, Pair 2")  %>%
    kable_styling(position = "center",latex_options = "HOLD_position")%>%
    footnote(general=c("The overall sample consists of individuals meeting the eligibility criteria at the baseline and were also interviewed at the follow-up wave. The attrition sample consists of individuals meeting the eligibility criteria at the baseline but were lost to follow-up.", "Abbreviation: SD = standard deviation."), general_title="",threeparttable = T, fixed_small_size = T) %>%
    save_kable(file = "attrtion_diagnosis_wave23.tex",format="latex")
  rm(tout,data_attrition)

# Output table, outcome: treatment
  tables(data_attrition_treat,data_treatment2,2)
  as_kable_extra(tout, format = "latex",booktabs = T,  escape = FALSE,
                 linesep = '', caption = "Characteristics of the overall and attrition samples, outcome: hypertension treatment, Pair 2")  %>%
    kable_styling(position = "center",latex_options = "HOLD_position")%>%
    footnote(general=c("The overall sample consists of individuals meeting the eligibility criteria at the baseline and were also interviewed at the follow-up wave. The attrition sample consists of individuals meeting the eligibility criteria at the baseline but were lost to follow-up.", "Abbreviation: SD = standard deviation."), general_title="",threeparttable = T, fixed_small_size = T) %>%
    save_kable(file = "attrtion_treatment_wave23.tex",format="latex")
  rm(tout,data_attrition_treat)


################################################################################
# Attrition in Pair 3
################################################################################

# datasets that only contain individuals from pair 3
  data_diagnosis3<-data_diagnosis[w45==1]
  data_treatment3<-data_treatment[w45==1]

# keep everyone who is in the pair baseline
  base3<-base[is.na(year_4)=="FALSE",]
  
# keep everyone aged 30 and older
  base3 <- base3[, age_calc:= trunc((pd_dob %--% entryd_fup4) / years(1))]
  base3<-base3[age_calc>=30,] 

# drop 1 observation with missing sex
  base3<-na.omit(base3, cols=(c("female")))  
  
# generate pair indicator
  base3$wave<-3
  
# generate variable for respondent reports prior diagnosis at pair baseline
  base3<-base3[mh_hbp=="Yes"| pd_hbp=="Yes" | fo_hypert=="Yes" | hbp_dis=="Yes",hypknow:=1] 
  base3<-base3[is.na(hypknow)=="TRUE",hypknow:=0] 
  base3<-base3[is.na(mh_hbp)=="TRUE" & is.na(pd_hbp)=="TRUE" & is.na(fo_hypert)=="TRUE" & is.na(hbp_dis=="Yes")=="TRUE",hypknow:=NA]
  
# generate variable for respondent took medication at pair baseline  
  base3<-base3[hbp_trt_all=="Yes", hyptreatknow:=1] #respondents who were taking bp medication in the baseline of this pair
  base3<-base3[(is.na(hyptreatknow)=="TRUE" & is.na(hypknow)=="FALSE" ) , hyptreatknow:=0] #respondents who were taking bp medication in the basline of this pair
  
# drop respondents with missing BP measurements at pair baseline
  base3<-base3[is.na(bp_s2)=="FALSE" & is.na(bp_s1)=="FALSE",]
  base3<-base3[is.na(bp_d2)=="FALSE" & is.na(bp_d1)=="FALSE",]

# Dataset for diagnosis outcome: Keep individuals who do not report prior diagnosis at pair baseline
  base3know<-base3[hypknow==0]

# Perform an anti-join to select observations in dataset 1 (all eligible individuals) but not in dataset 2 (those who ended up in final sample)
  data_attrition <- anti_join(base3know, data_diagnosis3, by = "pid")

# Dataset for treatment outcome: Keep individuals who do not report taking medication at pair baseline
  base3treat<-base3[hyptreatknow==0]

# Perform an anti-join to select observations in dataset 1 (all eligible individuals) but not in dataset 2 (those who ended up in final sample)
  data_attrition_treat <- anti_join(base3treat, data_treatment3, by = "pid")

# Output table, outcome: diagnosis
  tables(data_attrition,data_diagnosis3,3)
  as_kable_extra(tout, format = "latex",booktabs = T,  escape = FALSE,
                 linesep = '', caption = "Characteristics of the overall and attrition samples, outcome: hypertension diagnosis, Pair 3")  %>%
    kable_styling(position = "center",latex_options = "HOLD_position")%>%
    footnote(general=c("The overall sample consists of individuals meeting the eligibility criteria at the baseline and were also interviewed at the follow-up wave. The attrition sample consists of individuals meeting the eligibility criteria at the baseline but were lost to follow-up.", "Abbreviation: SD = standard deviation."), general_title="",threeparttable = T, fixed_small_size = T) %>%
    save_kable(file = "attrtion_diagnosis_wave45.tex",format="latex")
  rm(tout,data_attrition)

# Output table, outcome: treatment
  tables(data_attrition_treat,data_treatment3,3)
  as_kable_extra(tout, format = "latex",booktabs = T,  escape = FALSE,
                 linesep = '', caption = "Characteristics of the overall and attrition samples, outcome: hypertension treatment, Pair 3")  %>%
    kable_styling(position = "center",latex_options = "HOLD_position")%>%
    footnote(general=c("The overall sample consists of individuals meeting the eligibility criteria at the baseline and were also interviewed at the follow-up wave. The attrition sample consists of individuals meeting the eligibility criteria at the baseline but were lost to follow-up.", "Abbreviation: SD = standard deviation."), general_title="",threeparttable = T, fixed_small_size = T) %>%
    save_kable(file = "attrtion_treatment_wave45.tex",format="latex")
  rm(tout,data_attrition_treat)
